Autor: Nils Verheyen\ Matrikelnummer: 3043171


In [2]:
# DO NOT EDIT THIS CELL

print("importing...")
import csv
import random
import math
import operator
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
print("done")


importing...
done

In [3]:
#Input data
data_set = 'students.data'

In [4]:
def loadDataset(filename, split, trainingSet=[] , testSet=[]):
    with open(filename, 'r') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset) - 1):
            for y in range(len(dataset[0]) - 1):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])

Implement the following functions

  1. euclideanDistance
  2. getNeighbors
  3. estimateType

In [5]:
import numpy as np

In [6]:
train = []
test = []

loadDataset(data_set, .75, train, test)

print('train sample', end='\n\n')
for i in range(10):
    print(train[i])
    
    
print('test sample', end='\n\n')
for i in range(10):
    print(test[i])


train sample

[53.0, 114.0, 'passes']
[83.0, 121.0, 'passes']
[94.0, 116.0, 'passes']
[95.0, 78.0, 'passes']
[55.0, 129.0, 'passes']
[94.0, 87.0, 'passes']
[79.0, 85.0, 'passes']
[67.0, 99.0, 'passes']
[54.0, 100.0, 'passes']
[72.0, 124.0, 'passes']
test sample

[87.0, 117.0, 'passes']
[82.0, 122.0, 'passes']
[59.0, 92.0, 'passes']
[50.0, 105.0, 'passes']
[6.0, 83.0, 'fails']
[6.0, 80.0, 'fails']
[93.0, 130.0, 'passes']
[31.0, 123.0, 'passes']
[60.0, 114.0, 'passes']
[65.0, 84.0, 'passes']

In [7]:
def plot_data(data_sample):
    
    types = set([t[2] for t in data_sample])
    for t in types:
        df = [row for row in data_sample if row[2] == t]
        xs = [row[0] for row in df]
        ys = [row[1] for row in df]
        plt.scatter(xs, ys)
    plt.show()

print("Training data")
plot_data(train)

print("Test data")
plot_data(test)


Training data
Test data

In [8]:
# TODO Task 4B.1

def euclideanDistance(sample1, sample2):
    p1 = sample1[:2]
    p2 = sample2[:2]
    distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(p1, p2)]))
    return distance

print(train[0])
print(train[1])

euclideanDistance(train[0], train[1])


[53.0, 114.0, 'passes']
[83.0, 121.0, 'passes']
Out[8]:
30.805843601498726

In [9]:
# TODO Task 4B.2

def getNeighbors(trainingSet, testSample):
    
    k = 3
    
    #TODO: calcualte the euclidean distance to the neighbors.
    #Then return the k-nearest.
    
    neighbors = []
    for s in trainingSet:
        distance = euclideanDistance(s, testSample)
        neighbors.append((s, distance))
    nearest = sorted(neighbors, key=lambda t: t[1])
    
    # return with distance
    return nearest[:k]

sample = test[0]
neighbors = getNeighbors(train, sample)

print(sample)
for n in neighbors:
    print(n)


[87.0, 117.0, 'passes']
([87.0, 116.0, 'passes'], 1.0)
([86.0, 118.0, 'passes'], 1.4142135623730951)
([85.0, 118.0, 'passes'], 2.23606797749979)

In [10]:
# TODO Task 4B.3


def estimateType(neighbors):
    
    #TODO: determine the type of the current sample
    #by checking the neighbors and return it.
    
    types = dict()
    for key in set([t[2] for t in train]):
        types[key] = dict(count=0, distance=0)
    # types: 
    # {
    #     'passed': {
    #         'count': 0
    #         'distance': 0
    #     },
    #     ...
    # }
    
    for n in neighbors:
        key = n[0][2]
        types[key]['count'] += 1
        types[key]['distance'] += n[1]
        
    # max count of all types
    max_val = max(types.items(), key = lambda x: x[1]['count'])[1]['count']
    # may contain multiple types in case of even k (eg. fails=2, passed=2)
    targets = [t for t in types.items() if t[1]['count'] == max_val]
    
    # return the first type if only one type is inside the list
    if len(targets) == 1:
        return targets[0][0]
    
    # otherwise sort by distance and return the type with the lowest distance
    targets = sorted(targets, key=lambda t: t[1]['distance'])
    return targets[0]

estimateType(neighbors)


Out[10]:
'passes'

In [11]:
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [12]:
from datetime import datetime
start = datetime.now()

predictions = [estimateType(getNeighbors(train, t)) for t in test]

end = datetime.now()
print('Took %s to complete' % str(end - start))

print('Accuracy: %f' % getAccuracy(test, predictions))


Took 0:00:00.466882 to complete
Accuracy: 97.244094

Take a look on the data to get an impression of what's going on.

You should see two clusters of students.


In [13]:
xs = []
ys = []
zs = []
cs = []

fig = plt.figure(figsize=(25, 25))

plot_2d = fig.add_subplot(441)

with open(data_set, 'r') as csvfile:
    lines = csv.reader(csvfile)
    data = list(lines)
    for d in data:
        xs.append(int(d[0]))
        ys.append(int(d[1]))
        cs.append('r' if d[2] == "fails" else "g")

plot_2d.scatter(xs, ys, c=cs, marker='o')
plot_2d.set_xlabel('X: Student learned')
plot_2d.set_ylabel('Y: Student IQ')

plt.show()



In [14]:
trainingSet = []
testSet = []
split = 0.67

loadDataset(data_set, split, trainingSet, testSet)
print ('Train set: ' + repr(len(trainingSet)))
print ('Test set: ' + repr(len(testSet)))

predictions=[]

#This is k 
count = 0
for x in range(len(testSet)):
    neighbors = getNeighbors(trainingSet, testSet[x])
    result = estimateType(neighbors)
    predictions.append(result)
    print(str(count) + ': predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    count += 1
    
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + repr(accuracy) + '%')


Train set: 666
Test set: 333
0: predicted='passes', actual='passes'
1: predicted='passes', actual='passes'
2: predicted='passes', actual='passes'
3: predicted='passes', actual='passes'
4: predicted='passes', actual='passes'
5: predicted='passes', actual='passes'
6: predicted='passes', actual='passes'
7: predicted='passes', actual='passes'
8: predicted='passes', actual='passes'
9: predicted='fails', actual='fails'
10: predicted='fails', actual='fails'
11: predicted='fails', actual='passes'
12: predicted='fails', actual='fails'
13: predicted='passes', actual='passes'
14: predicted='fails', actual='fails'
15: predicted='passes', actual='passes'
16: predicted='passes', actual='passes'
17: predicted='fails', actual='fails'
18: predicted='fails', actual='fails'
19: predicted='fails', actual='fails'
20: predicted='fails', actual='fails'
21: predicted='fails', actual='fails'
22: predicted='passes', actual='passes'
23: predicted='fails', actual='fails'
24: predicted='fails', actual='fails'
25: predicted='fails', actual='fails'
26: predicted='fails', actual='fails'
27: predicted='passes', actual='passes'
28: predicted='fails', actual='fails'
29: predicted='fails', actual='fails'
30: predicted='fails', actual='passes'
31: predicted='passes', actual='passes'
32: predicted='passes', actual='passes'
33: predicted='passes', actual='passes'
34: predicted='fails', actual='fails'
35: predicted='fails', actual='fails'
36: predicted='passes', actual='passes'
37: predicted='passes', actual='passes'
38: predicted='passes', actual='passes'
39: predicted='fails', actual='fails'
40: predicted='fails', actual='fails'
41: predicted='passes', actual='passes'
42: predicted='fails', actual='fails'
43: predicted='passes', actual='passes'
44: predicted='passes', actual='passes'
45: predicted='fails', actual='fails'
46: predicted='fails', actual='fails'
47: predicted='fails', actual='fails'
48: predicted='passes', actual='passes'
49: predicted='fails', actual='fails'
50: predicted='passes', actual='passes'
51: predicted='passes', actual='passes'
52: predicted='passes', actual='fails'
53: predicted='passes', actual='passes'
54: predicted='passes', actual='passes'
55: predicted='fails', actual='fails'
56: predicted='fails', actual='fails'
57: predicted='passes', actual='passes'
58: predicted='passes', actual='passes'
59: predicted='fails', actual='fails'
60: predicted='fails', actual='fails'
61: predicted='passes', actual='passes'
62: predicted='passes', actual='passes'
63: predicted='passes', actual='passes'
64: predicted='passes', actual='passes'
65: predicted='fails', actual='fails'
66: predicted='passes', actual='passes'
67: predicted='passes', actual='passes'
68: predicted='passes', actual='passes'
69: predicted='passes', actual='passes'
70: predicted='fails', actual='fails'
71: predicted='fails', actual='fails'
72: predicted='passes', actual='passes'
73: predicted='passes', actual='passes'
74: predicted='passes', actual='passes'
75: predicted='fails', actual='passes'
76: predicted='passes', actual='passes'
77: predicted='passes', actual='passes'
78: predicted='passes', actual='passes'
79: predicted='passes', actual='passes'
80: predicted='passes', actual='passes'
81: predicted='fails', actual='fails'
82: predicted='passes', actual='passes'
83: predicted='passes', actual='passes'
84: predicted='passes', actual='passes'
85: predicted='passes', actual='passes'
86: predicted='passes', actual='passes'
87: predicted='passes', actual='fails'
88: predicted='passes', actual='passes'
89: predicted='passes', actual='passes'
90: predicted='fails', actual='fails'
91: predicted='fails', actual='fails'
92: predicted='fails', actual='fails'
93: predicted='passes', actual='passes'
94: predicted='fails', actual='fails'
95: predicted='passes', actual='passes'
96: predicted='passes', actual='passes'
97: predicted='fails', actual='fails'
98: predicted='passes', actual='passes'
99: predicted='passes', actual='passes'
100: predicted='passes', actual='passes'
101: predicted='passes', actual='passes'
102: predicted='fails', actual='fails'
103: predicted='passes', actual='passes'
104: predicted='passes', actual='passes'
105: predicted='passes', actual='passes'
106: predicted='passes', actual='passes'
107: predicted='passes', actual='passes'
108: predicted='passes', actual='passes'
109: predicted='fails', actual='fails'
110: predicted='fails', actual='fails'
111: predicted='fails', actual='fails'
112: predicted='passes', actual='passes'
113: predicted='passes', actual='passes'
114: predicted='passes', actual='passes'
115: predicted='fails', actual='fails'
116: predicted='passes', actual='passes'
117: predicted='fails', actual='fails'
118: predicted='passes', actual='passes'
119: predicted='passes', actual='passes'
120: predicted='passes', actual='passes'
121: predicted='passes', actual='passes'
122: predicted='passes', actual='passes'
123: predicted='passes', actual='passes'
124: predicted='passes', actual='passes'
125: predicted='passes', actual='passes'
126: predicted='fails', actual='fails'
127: predicted='passes', actual='passes'
128: predicted='passes', actual='passes'
129: predicted='fails', actual='fails'
130: predicted='fails', actual='fails'
131: predicted='passes', actual='passes'
132: predicted='fails', actual='fails'
133: predicted='passes', actual='passes'
134: predicted='passes', actual='passes'
135: predicted='passes', actual='passes'
136: predicted='passes', actual='passes'
137: predicted='passes', actual='passes'
138: predicted='passes', actual='passes'
139: predicted='fails', actual='fails'
140: predicted='passes', actual='passes'
141: predicted='fails', actual='fails'
142: predicted='passes', actual='passes'
143: predicted='passes', actual='passes'
144: predicted='fails', actual='fails'
145: predicted='passes', actual='passes'
146: predicted='fails', actual='fails'
147: predicted='passes', actual='passes'
148: predicted='passes', actual='passes'
149: predicted='passes', actual='passes'
150: predicted='fails', actual='fails'
151: predicted='fails', actual='fails'
152: predicted='passes', actual='passes'
153: predicted='passes', actual='passes'
154: predicted='passes', actual='passes'
155: predicted='passes', actual='passes'
156: predicted='fails', actual='fails'
157: predicted='fails', actual='fails'
158: predicted='passes', actual='passes'
159: predicted='passes', actual='passes'
160: predicted='passes', actual='passes'
161: predicted='passes', actual='passes'
162: predicted='passes', actual='passes'
163: predicted='passes', actual='passes'
164: predicted='fails', actual='fails'
165: predicted='fails', actual='fails'
166: predicted='fails', actual='fails'
167: predicted='passes', actual='passes'
168: predicted='passes', actual='passes'
169: predicted='fails', actual='fails'
170: predicted='fails', actual='fails'
171: predicted='passes', actual='passes'
172: predicted='passes', actual='passes'
173: predicted='passes', actual='passes'
174: predicted='passes', actual='passes'
175: predicted='fails', actual='passes'
176: predicted='fails', actual='fails'
177: predicted='fails', actual='fails'
178: predicted='passes', actual='passes'
179: predicted='passes', actual='passes'
180: predicted='fails', actual='fails'
181: predicted='fails', actual='fails'
182: predicted='passes', actual='passes'
183: predicted='passes', actual='passes'
184: predicted='fails', actual='fails'
185: predicted='fails', actual='fails'
186: predicted='passes', actual='passes'
187: predicted='passes', actual='passes'
188: predicted='passes', actual='passes'
189: predicted='passes', actual='passes'
190: predicted='fails', actual='fails'
191: predicted='fails', actual='fails'
192: predicted='passes', actual='passes'
193: predicted='passes', actual='passes'
194: predicted='fails', actual='fails'
195: predicted='passes', actual='passes'
196: predicted='fails', actual='fails'
197: predicted='passes', actual='passes'
198: predicted='fails', actual='fails'
199: predicted='fails', actual='fails'
200: predicted='passes', actual='passes'
201: predicted='passes', actual='passes'
202: predicted='passes', actual='passes'
203: predicted='fails', actual='fails'
204: predicted='passes', actual='passes'
205: predicted='passes', actual='passes'
206: predicted='passes', actual='passes'
207: predicted='passes', actual='passes'
208: predicted='passes', actual='passes'
209: predicted='passes', actual='passes'
210: predicted='fails', actual='fails'
211: predicted='passes', actual='passes'
212: predicted='fails', actual='fails'
213: predicted='fails', actual='fails'
214: predicted='passes', actual='passes'
215: predicted='passes', actual='passes'
216: predicted='passes', actual='passes'
217: predicted='passes', actual='passes'
218: predicted='fails', actual='fails'
219: predicted='fails', actual='fails'
220: predicted='fails', actual='fails'
221: predicted='passes', actual='passes'
222: predicted='passes', actual='passes'
223: predicted='passes', actual='passes'
224: predicted='passes', actual='passes'
225: predicted='passes', actual='passes'
226: predicted='passes', actual='passes'
227: predicted='fails', actual='fails'
228: predicted='fails', actual='fails'
229: predicted='fails', actual='fails'
230: predicted='passes', actual='passes'
231: predicted='passes', actual='passes'
232: predicted='passes', actual='passes'
233: predicted='passes', actual='passes'
234: predicted='fails', actual='fails'
235: predicted='passes', actual='passes'
236: predicted='passes', actual='passes'
237: predicted='passes', actual='passes'
238: predicted='passes', actual='passes'
239: predicted='passes', actual='passes'
240: predicted='passes', actual='passes'
241: predicted='passes', actual='passes'
242: predicted='fails', actual='fails'
243: predicted='passes', actual='passes'
244: predicted='passes', actual='passes'
245: predicted='fails', actual='fails'
246: predicted='passes', actual='passes'
247: predicted='passes', actual='passes'
248: predicted='fails', actual='fails'
249: predicted='passes', actual='passes'
250: predicted='passes', actual='passes'
251: predicted='passes', actual='passes'
252: predicted='fails', actual='fails'
253: predicted='passes', actual='passes'
254: predicted='passes', actual='passes'
255: predicted='passes', actual='passes'
256: predicted='passes', actual='passes'
257: predicted='passes', actual='passes'
258: predicted='fails', actual='fails'
259: predicted='passes', actual='passes'
260: predicted='passes', actual='passes'
261: predicted='fails', actual='fails'
262: predicted='passes', actual='passes'
263: predicted='passes', actual='passes'
264: predicted='passes', actual='passes'
265: predicted='fails', actual='fails'
266: predicted='passes', actual='passes'
267: predicted='passes', actual='passes'
268: predicted='fails', actual='fails'
269: predicted='fails', actual='fails'
270: predicted='passes', actual='passes'
271: predicted='passes', actual='passes'
272: predicted='passes', actual='passes'
273: predicted='passes', actual='fails'
274: predicted='passes', actual='passes'
275: predicted='fails', actual='fails'
276: predicted='passes', actual='passes'
277: predicted='passes', actual='passes'
278: predicted='fails', actual='fails'
279: predicted='fails', actual='fails'
280: predicted='passes', actual='passes'
281: predicted='fails', actual='fails'
282: predicted='passes', actual='passes'
283: predicted='passes', actual='passes'
284: predicted='passes', actual='passes'
285: predicted='fails', actual='fails'
286: predicted='fails', actual='fails'
287: predicted='passes', actual='passes'
288: predicted='fails', actual='fails'
289: predicted='passes', actual='passes'
290: predicted='fails', actual='fails'
291: predicted='passes', actual='passes'
292: predicted='passes', actual='passes'
293: predicted='passes', actual='passes'
294: predicted='fails', actual='fails'
295: predicted='fails', actual='fails'
296: predicted='passes', actual='passes'
297: predicted='passes', actual='passes'
298: predicted='passes', actual='passes'
299: predicted='fails', actual='fails'
300: predicted='fails', actual='fails'
301: predicted='fails', actual='fails'
302: predicted='fails', actual='fails'
303: predicted='passes', actual='passes'
304: predicted='fails', actual='fails'
305: predicted='passes', actual='passes'
306: predicted='fails', actual='fails'
307: predicted='passes', actual='passes'
308: predicted='fails', actual='fails'
309: predicted='passes', actual='passes'
310: predicted='fails', actual='fails'
311: predicted='passes', actual='passes'
312: predicted='passes', actual='passes'
313: predicted='fails', actual='fails'
314: predicted='passes', actual='passes'
315: predicted='passes', actual='passes'
316: predicted='passes', actual='passes'
317: predicted='passes', actual='passes'
318: predicted='fails', actual='fails'
319: predicted='passes', actual='passes'
320: predicted='passes', actual='passes'
321: predicted='fails', actual='fails'
322: predicted='passes', actual='passes'
323: predicted='fails', actual='fails'
324: predicted='passes', actual='passes'
325: predicted='passes', actual='passes'
326: predicted='passes', actual='passes'
327: predicted='fails', actual='fails'
328: predicted='fails', actual='fails'
329: predicted='fails', actual='fails'
330: predicted='fails', actual='fails'
331: predicted='passes', actual='passes'
332: predicted='fails', actual='fails'
Accuracy: 97.8978978978979%